In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import random

comment data 가져오기 및 전처리


In [2]:
episode_comment = pd.read_csv("data/webnovel/episode_comments.csv", index_col=0, encoding="cp949")


---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<ipython-input-2-105bfe4cfe84> in <module>()
----> 1 episode_comment = pd.read_csv("data/webnovel/episode_comments.csv", index_col=0, encoding="cp949")

C:\Users\kms\Anaconda3\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    560                     skip_blank_lines=skip_blank_lines)
    561 
--> 562         return _read(filepath_or_buffer, kwds)
    563 
    564     parser_f.__name__ = name

C:\Users\kms\Anaconda3\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
    313 
    314     # Create the parser.
--> 315     parser = TextFileReader(filepath_or_buffer, **kwds)
    316 
    317     if (nrows is not None) and (chunksize is not None):

C:\Users\kms\Anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds)
    643             self.options['has_index_names'] = kwds['has_index_names']
    644 
--> 645         self._make_engine(self.engine)
    646 
    647     def close(self):

C:\Users\kms\Anaconda3\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine)
    797     def _make_engine(self, engine='c'):
    798         if engine == 'c':
--> 799             self._engine = CParserWrapper(self.f, **self.options)
    800         else:
    801             if engine == 'python':

C:\Users\kms\Anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, src, **kwds)
   1211         kwds['allow_leading_cols'] = self.index_col is not False
   1212 
-> 1213         self._reader = _parser.TextReader(src, **kwds)
   1214 
   1215         # XXX

pandas\parser.pyx in pandas.parser.TextReader.__cinit__ (pandas\parser.c:3427)()

pandas\parser.pyx in pandas.parser.TextReader._setup_parser_source (pandas\parser.c:6861)()

OSError: File b'data/webnovel/episode_comments.csv' does not exist

In [ ]:
episode_comment["ID"] = episode_comment["object_id"].apply(lambda x: x.split("-")[0])
episode_comment["volume"] = episode_comment["object_id"].apply(lambda x: x.split("-")[1]).astype("int")
episode_comment["writer_nickname"].fillna("", inplace=True)

def make_user_id(i):
    if episode_comment["writer_nickname"].loc[i] == "":
        return episode_comment["writer_ip"].loc[i] + episode_comment["writer_id"].loc[i]
    else:
        return episode_comment["writer_nickname"].loc[i] + episode_comment["writer_id"].loc[i]

user_id = [
    make_user_id(i)
    for i in range(len(episode_comment))
]
    
    
episode_comment["user_id"] = user_id
episode_comment.drop(
    [
        "contents", 
        "down_count", 
        "modified_ymdt", 
        "registered_ymdt",
        "ticket",
        "up_count",
        "writer_ip",
        "writer_id",
        "writer_nickname",
        "writer_profile_type",
        "object_id",
    ],
    axis=1,
    inplace=True
)

episode_comment.head()

In [238]:
main_comment = pd.read_csv("data/webnovel/main_comments.csv", index_col=0, encoding="cp949")

In [239]:
main_comment["ID"] = main_comment["object_id"].apply(lambda x: x.split("-")[1])
main_comment["volume"] = 0
main_comment["writer_nickname"].fillna("", inplace=True)

def make_user_id(i):
    if main_comment["writer_nickname"].loc[i] == "":
        return main_comment["writer_ip"].loc[i] + main_comment["writer_id"].loc[i]
    else:
        return main_comment["writer_nickname"].loc[i] + main_comment["writer_id"].loc[i]

user_id = [
    make_user_id(i)
    for i in range(len(main_comment))
]
    
    
main_comment["user_id"] = user_id
main_comment.drop(
    [
        "contents", 
        "down_count", 
        "modified_ymdt", 
        "registered_ymdt",
        "ticket",
        "up_count",
        "writer_ip",
        "writer_id",
        "writer_nickname",
        "writer_profile_type",
        "object_id",
    ],
    axis=1,
    inplace=True
)

main_comment.head()


Out[239]:
is_facebook is_twitter mobile_yn ID volume user_id
0 False False Y 466391 0 오지선chan****
1 False False Y 466391 0 뚱냥이cckc****
2 False False Y 466391 0 또자123hhyy****
3 False False Y 466391 0 솔방울jeon****
4 False False Y 466391 0 귀여미피그jian****

user dataframe 만들기


In [269]:
user_df = pd.concat([episode_comment, main_comment]).groupby(["user_id", "ID"], as_index=False).agg({"volume":np.size})

In [270]:
len(user_df)


Out[270]:
355861

In [271]:
df = pd.read_csv("data/webnovel/main_df.csv", encoding="cp949", index_col=0)
df["ID"] = df["ID"].astype("str")

In [243]:
df = user_df.merge(df, on="ID")[["user_id", "genre", "volume"]].drop_duplicates()

In [244]:
len(df["user_id"].unique())


Out[244]:
169464

In [245]:
romance = df[df["genre"] == 101]

In [246]:
no_romance = df[df["genre"] != 101]

In [247]:
len(romance.merge(no_romance, on="user_id"))


Out[247]:
126631

user, book 인덱스 및 처리


In [248]:
user_size = len(user_df["user_id"].unique())

In [249]:
users = user_df["user_id"].unique()

In [250]:
users_index = {
    user:index
    for index, user in enumerate(users)
}

In [266]:
book_df = pd.read_csv("data/webnovel/main_df.csv", encoding="cp949", index_col=0)

In [264]:
book_size = len(book_df.ID.unique())

In [253]:
books = book_df.ID.unique()

In [262]:
len(books)


Out[262]:
241

In [254]:
books_index = {
    str(book):index
    for index, book in enumerate(books)
}

In [255]:
user_df["book_index"] = user_df["ID"].apply(lambda x: books_index[x])

In [256]:
user_df["user_index"] = user_df["user_id"].apply(lambda x: users_index[x])

user * book matrix 만들기


In [257]:
empty_matrix = np.zeros((user_size, book_size))

In [259]:
for index, i in user_df.iterrows():
    empty_matrix[i["user_index"], i["book_index"]] = i["volume"]

In [265]:
user_book_matrix = pd.DataFrame(empty_matrix, columns=books)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
C:\Users\kms\Anaconda3\lib\site-packages\pandas\core\internals.py in create_block_manager_from_blocks(blocks, axes)
   3983                 blocks = [make_block(values=blocks[0],
-> 3984                                      placement=slice(0, len(axes[0])))]
   3985 

C:\Users\kms\Anaconda3\lib\site-packages\pandas\core\internals.py in make_block(values, placement, klass, ndim, dtype, fastpath)
   2517 
-> 2518     return klass(values, ndim=ndim, fastpath=fastpath, placement=placement)
   2519 

C:\Users\kms\Anaconda3\lib\site-packages\pandas\core\internals.py in __init__(self, values, placement, ndim, fastpath)
     89                              'implies %d' % (len(self.values),
---> 90                                              len(self.mgr_locs)))
     91 

ValueError: Wrong number of items passed 245, placement implies 241

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-265-98765acef094> in <module>()
----> 1 user_book_matrix = pd.DataFrame(empty_matrix, columns=books)

C:\Users\kms\Anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
    253             else:
    254                 mgr = self._init_ndarray(data, index, columns, dtype=dtype,
--> 255                                          copy=copy)
    256         elif isinstance(data, (list, types.GeneratorType)):
    257             if isinstance(data, types.GeneratorType):

C:\Users\kms\Anaconda3\lib\site-packages\pandas\core\frame.py in _init_ndarray(self, values, index, columns, dtype, copy)
    430             values = _possibly_infer_to_datetimelike(values)
    431 
--> 432         return create_block_manager_from_blocks([values], [columns, index])
    433 
    434     @property

C:\Users\kms\Anaconda3\lib\site-packages\pandas\core\internals.py in create_block_manager_from_blocks(blocks, axes)
   3991         blocks = [getattr(b, 'values', b) for b in blocks]
   3992         tot_items = sum(b.shape[0] for b in blocks)
-> 3993         construction_error(tot_items, blocks[0].shape[1:], axes, e)
   3994 
   3995 

C:\Users\kms\Anaconda3\lib\site-packages\pandas\core\internals.py in construction_error(tot_items, block_shape, axes, e)
   3968         raise ValueError("Empty data passed with indices specified.")
   3969     raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
-> 3970         passed, implied))
   3971 
   3972 

ValueError: Shape of passed values is (245, 169464), indices imply (241, 169464)

In [22]:
user_book_matrix.index = users

In [23]:
user_book_matrix


Out[23]:
466391 398090 514809 523286 505096 552533 514807 466374 483047 514808 ... 505107 538923 466392 538925 433837 538926 398095 272505 252942 291783
까kim_**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
(주)백송종합건설bs12**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
(주)태현유니크thyu**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
00 처음 00ab93**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000000000wkdg**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
0000002055**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000altj**** 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000arom**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000bacc**** 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000chlw**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
000000deka**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000demi**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000ds2a**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000eoeo**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000evil**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000hang**** 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000ibg0**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000jeni**** 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000jiyo**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000jkw8**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000jset**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000kae4**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000khs4**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000ki34**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000kimc**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000lee3**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000mark**** 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000mizl**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000naja**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
000000peri**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
힘멜dj64**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힘빠진토마토sych**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힘센아이dud3**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힘센토끼sks4**** 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힘쌤이thsa**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힘쎈 오리flyd**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힘을내요비버킴bbas**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힘을내자rjad**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힘짱요digh**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힘차게달려라111sos5**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힘차게살자011t**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힘찬맘sigo**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힘찬이엄마kono**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힙과 허벅지kim_**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힙합yeon**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힙합소녀gana**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힙합키드ghkd**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0
힛kyou**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힛츠wns7**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힛큭jiho**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힛키ansr**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힝pink**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힝구0w0o**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힝이goss**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힝짓work**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힝콩muni**** 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힝행tidh**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힝훙헹홍tjdu**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힝훙헹훙5gks**** 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
힝힝namo**** 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

169464 rows × 241 columns

user * user cosine similarity 매트릭스 만들기

  • 1 권 169464 명 1분 59초
  • 2 권 57555 명 40.6초
  • 3 권 31808 명 22.4초
  • 4 권 20470 명 14.5초
  • 5 권 14393 명 10.2초
  • 6 권 10630 명 7.58초
  • 7 권 8074 명 5.8초
  • 8 권 6306 명 4.54초
  • 9 권 4995 명 3.56초
  • 10 권 4052 명 2.91초

In [335]:
for i in range(15):
    print(i+1, "권 이상 읽은 사람은",len(user_book_matrix[user_book_matrix.sum(axis=1)>i]), "명 입니다.")


1 권 이상 읽은 사람은 169464 명 입니다.
2 권 이상 읽은 사람은 57555 명 입니다.
3 권 이상 읽은 사람은 31808 명 입니다.
4 권 이상 읽은 사람은 20470 명 입니다.
5 권 이상 읽은 사람은 14393 명 입니다.
6 권 이상 읽은 사람은 10630 명 입니다.
7 권 이상 읽은 사람은 8074 명 입니다.
8 권 이상 읽은 사람은 6306 명 입니다.
9 권 이상 읽은 사람은 4995 명 입니다.
10 권 이상 읽은 사람은 4052 명 입니다.
11 권 이상 읽은 사람은 3330 명 입니다.
12 권 이상 읽은 사람은 2811 명 입니다.
13 권 이상 읽은 사람은 2383 명 입니다.
14 권 이상 읽은 사람은 2036 명 입니다.
15 권 이상 읽은 사람은 1754 명 입니다.

In [179]:
from scipy.spatial import distance
def cosine_distance(a, b):
    return 1 - distance.cosine(a, b)

In [216]:
def make_score(books):
    """
    MAE 스코어 계산
    """
    user_books_matrix_two = user_book_matrix[user_book_matrix.sum(axis=1)>books]
    empty_matrix = np.zeros((50, len(user_books_matrix_two))) # 샘플 10명
    users_two_index = user_books_matrix_two.index
    user_books_matrix_two.index = range(len(user_books_matrix_two))

    for index_1, i in user_books_matrix_two[:10].iterrows():
        for index_2, j in user_books_matrix_two[index_1+1:].iterrows():
            empty_matrix[index_1, index_2] = cosine_distance(i, j)

    score_list = []
    for i in range(10):
        ID_index = []
        while len(ID_index) < 11:
            if empty_matrix[i].argmax() >= 1:
                empty_matrix[i, empty_matrix[i].argmax()] = 0
            else:
                ID_index.append(empty_matrix[i].argmax())
                empty_matrix[i, empty_matrix[i].argmax()] = 0
        data = user_books_matrix_two.loc[i]
        predict = user_books_matrix_two.loc[ID_index].mean()
        score = data[data > 0] - predict[data > 0]
        score_list.append(np.absolute(score).sum()/len(score))
    print(np.array(score_list).mean())
    return np.array(score_list).mean()

In [218]:
scores = list(map(make_score, [0,1,2,3,4,5,6,7,8,9]))


0.833333333333
0.733333333333
0.775
0.875
0.846388888889
0.838611111111
0.855277777778
0.86
0.850909090909
0.884242424242

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [52]:
user_df[user_df["user_id"] == users_two_index[empty_matrix[0].argmax()]]


Out[52]:
user_id ID volume book_index user_index
82131 Ronevebala**** 390140 3 54 47957
82132 Ronevebala**** 409350 65 138 47957
82133 Ronevebala**** 466391 54 0 47957
82134 Ronevebala**** 473137 44 90 47957
82135 Ronevebala**** 538919 1 21 47957

In [53]:
user_df[user_df["user_id"] == users_two_index[0]]


Out[53]:
user_id ID volume book_index user_index
12 000000bacc**** 252936 9 95 8
13 000000bacc**** 307729 10 60 8
14 000000bacc**** 390140 4 54 8
15 000000bacc**** 466391 77 0 8
16 000000bacc**** 523290 27 25 8
17 000000bacc**** 538919 14 21 8
18 000000bacc**** 545432 4 22 8

In [42]:
user_books_matrix_two


Out[42]:
466391 398090 514809 523286 505096 552533 514807 466374 483047 514808 ... 505107 538923 466392 538925 433837 538926 398095 272505 252942 291783
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
1 1.0 0.0 1.0 1.0 1.0 1.0 0.0 1.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 1.0 1.0 0.0 0.0 1.0 1.0 0.0 1.0 1.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
6 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
7 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
8 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
9 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
10 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
11 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
12 1.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 ... 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
13 0.0 1.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
15 1.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
16 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
17 0.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0 1.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
18 1.0 0.0 0.0 1.0 1.0 0.0 1.0 1.0 1.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
19 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
20 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
21 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
22 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
23 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
24 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
25 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
26 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
27 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
28 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
29 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4022 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4023 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
4024 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4025 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4026 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0
4027 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4028 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4029 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4030 1.0 1.0 1.0 0.0 1.0 1.0 1.0 1.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4031 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4032 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
4033 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4034 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4035 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4036 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4037 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4038 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4039 1.0 1.0 0.0 1.0 1.0 1.0 0.0 1.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4040 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0
4041 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4042 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4043 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4044 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4045 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4046 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4047 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
4048 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
4049 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4050 1.0 1.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4051 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

4052 rows × 241 columns


In [ ]: